# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(ggplot2)
library(summarytools)
## 
## Attaching package: 'summarytools'
## 
## The following object is masked from 'package:tibble':
## 
##     view
library(ggcorrplot)
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(viridis)
## Loading required package: viridisLite
library(readr)
library(treemap)

# Load the datasets
estimated_numbers <- read_csv("estimated_numbers.csv")
## Rows: 856 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Country, No. of cases, No. of deaths, WHO Region
## dbl (7): Year, No. of cases_median, No. of cases_min, No. of cases_max, No. ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
incidence_per_1000_pop_at_risk <- read_csv("incidence_per_1000_pop_at_risk.csv")
## Rows: 2033 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, WHO Region
## dbl (2): Year, Incidence per 1000 population at risk
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
reported_numbers <- read_csv("reported_numbers.csv")
## Rows: 1944 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, WHO Region
## dbl (3): Year, No. of cases, No. of deaths
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load geospatial data (world map)
world <- st_read("ne_10m_admin_0_countries.shp")
## Reading layer `ne_10m_admin_0_countries' from data source 
##   `C:\Users\sputhuma\Documents\HJF\ne_10m_admin_0_countries.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 258 features and 168 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## Geodetic CRS:  WGS 84
reported_numbers <- reported_numbers %>%
  rename(NAME = `NAME`, `Reported Cases` = `No. of cases`, `Reported Deaths` = `No. of deaths`)



# Merge the malaria data with the geospatial data
malaria_map_data <- left_join(world, reported_numbers %>% filter(Year == max(Year)), by = "NAME")



# Check if 'Reported Cases' column exists
if(!"Reported Cases" %in% names(malaria_map_data)) {
  stop("Column 'Reported Cases' does not exist in the merged data.")
}


# Handle any NAs in the merged data
malaria_map_data <- malaria_map_data %>% drop_na(`Reported Cases`)

# Fix invalid geometries
malaria_map_data <- st_make_valid(malaria_map_data)

# Visualize the map using tmap
tmap_options(check.and.fix = TRUE)

tm_shape(malaria_map_data) +
  tm_polygons("Reported Cases", style = "quantile", palette = "viridis", title = "Reported Malaria Cases") +
  tm_layout(title = "Global Distribution of Reported Malaria Cases",
            title.position = c("left", "top"),
            legend.outside = TRUE)

# ggplot2 for geospatial visualization
ggplot(data = malaria_map_data) +
  geom_sf(aes(fill = `Reported Cases`)) +
  scale_fill_viridis_c() +
  labs(title = "Global Distribution of Reported Malaria Cases",
       fill = "Reported Cases") +
  theme_minimal()

# Interactive map with tmap
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(malaria_map_data) +
  tm_polygons("Reported Cases", style = "quantile", palette = "viridis", title = "Reported Malaria Cases", 
              id = "NAME", popup.vars = c("Country Name" = "NAME", "Reported Cases" = "Reported Cases")) +
  tm_layout(title = "Global Distribution of Reported Malaria Cases")
# Insight 1: Top regions affected
top_regions <- malaria_map_data %>%
  st_drop_geometry() %>%
  arrange(desc(`Reported Cases`)) %>%
  head(10)





# Rename columns in incidence_per_1000_pop_at_risk to match those in world
incidence_per_1000_pop_at_risk <- incidence_per_1000_pop_at_risk %>%
  rename(NAME = `NAME`, `Incidence per 1000 population` = `Incidence per 1000 population at risk`)

# Merge with the world data
incidence_map_data <- left_join(world, incidence_per_1000_pop_at_risk %>% filter(Year == max(Year)), by = "NAME")



# Handle any NAs in the merged data
incidence_map_data <- incidence_map_data %>% drop_na(`Incidence per 1000 population`)

# Visualize incidence data
tm_shape(incidence_map_data) +
  tm_polygons("Incidence per 1000 population", style = "quantile", palette = "magma", title = "Malaria Incidence per 1000 Population at Risk",
              id = "NAME", popup.vars = c("Country Name" = "NAME", "Incidence per 1000 Population" = "Incidence per 1000 population")) +
  tm_layout(title = "Global Malaria Incidence",
            title.position = c("left", "top"),
            legend.outside = TRUE)
## Warning: The shape incidence_map_data is invalid. See sf::st_is_valid
# Insight 3: Comparison of cases and deaths
case_death_comparison <- reported_numbers %>%
  filter(Year == max(Year)) %>%
  select(NAME, `Reported Cases`, `Reported Deaths`) %>%
  arrange(desc(`Reported Cases`))


# Filter top 50 countries by reported cases and deaths
top_50_cases <- case_death_comparison %>%
  arrange(desc(`Reported Cases`)) %>%
  head(50)

top_50_deaths <- case_death_comparison %>%
  arrange(desc(`Reported Deaths`)) %>%
  head(50)


# Create a data frame for plotting
cases_data <- top_50_cases %>%
  mutate(text = paste("Country: ", NAME, "<br>Reported Cases: ", `Reported Cases`))

deaths_data <- top_50_deaths %>%
  mutate(text = paste("Country: ", NAME, "<br>Reported Deaths: ", `Reported Deaths`))



# Interactive treemap for Reported Cases with unique colors for each country
plot_cases <- plot_ly(
  data = cases_data,
  type = "treemap",
  ids = ~NAME,
  labels = ~NAME,
  parents = "",
  values = ~`Reported Cases`,
  textinfo = "label+value+percent entry",
  hoverinfo = "text",
  text = ~text,
  marker = list(colors = ~NAME, colorscale = "Set3", showscale = FALSE)
) %>%
  layout(title = "Top 50 Countries with Reported Malaria Cases (2000-2018)")




# Interactive treemap for Reported Deaths
plot_deaths <- plot_ly(
  data = deaths_data,
  type = "treemap",
  ids = ~NAME,
  labels = ~NAME,
  parents = "",
  values = ~`Reported Deaths`,
  textinfo = "label+value+percent entry",
  hoverinfo = "text",
  text = ~text,
  marker = list(colors = ~`Reported Deaths`, colorscale = "Reds", showscale = TRUE)
) %>%
  layout(title = "Top 50 Countries with Reported Malaria Deaths (2000-2018)")

# Print the interactive plots
plot_cases
plot_deaths